1 Introduction

Multiples factors play an important function during the naive CD4-T cell differentiation where Blimp1 (encoded by Prdm1) and Bcl6 are the masters. In this notebook, we are going to characterize the accessibility pattern of these regulators.

2 Pre-processing

2.1 Load packages

library(Seurat)
library(Signac)
library(GenomicRanges)
library(pheatmap)
library(JASPAR2020)
library(TFBSTools)
library(dplyr)
library(ggplot2)
library(tidyverse)
library(data.table)
library(ggpubr)
library(writexl)
library(plyr)
library(stringr)
library(rio)

2.2 Parameters

cell_type <- "CD4_T"

path_to_obj <- paste0(
  here::here("scATAC-seq/results/R_objects/level_5/"),
  cell_type,
  "/04.",
  cell_type,
  "_integration_peak_calling_level_5.rds",
  sep = ""
)

path_to_RNA <- paste0(
  here::here("scRNA-seq/3-clustering/5-level_5/"),
  cell_type,
  "/",
  cell_type,
  "_subseted_annotated_level_5.rds",
  sep = ""
)

upstream <- 2000

color_palette <-  c("#1CFFCE", "#90AD1C", "#C075A6", "#85660D", 
                    "#5A5156", "#AA0DFE", "#F8A19F", "#F7E1A0",
                    "#1C8356", "#FEAF16", "#822E1C", "#C4451C", 
                    "#1CBE4F", "#325A9B", "#F6222E", "#FE00FA",
                    "#FBE426", "#16FF32",  "black",  "#3283FE",
                    "#B00068", "#DEA0FD", "#B10DA1", "#E4E1E3", 
                    "#90AD1C", "#FE00FA", "#85660D", "#3B00FB", 
                    "#822E1C", "coral2",  "#1CFFCE", "#1CBE4F", 
                    "#3283FE", "#FBE426", "#F7E1A0", "#325A9B", 
                    "#2ED9FF", "#B5EFB5", "#5A5156", "#DEA0FD",
                    "#FEAF16", "#683B79", "#B10DA1", "#1C7F93", 
                    "#F8A19F", "dark orange", "#FEAF16", 
                    "#FBE426", "Brown")

2.3 Functions

plot_dim <- function(seurat, group){
  DimPlot(seurat, 
  group.by = group,
  cols = color_palette,
  pt.size = 0.1,raster=FALSE)
}

mat_heatmap <- function(seurat, features, group,
                        cutree_ncols,cutree_nrows){
  
my_sample_col <- data.frame(Group = rep(c("Naive", "Central Memory",
                                          "Central Memory","Non-Tfh",
                                          "Non-Tfh","Non-Tfh",
                                          "Non-Tfh","Non-Tfh",
                                          "Non-Tfh","Tfh",
                                          "Tfh","Tfh",
                                          "Tfh","Tfh")))

rownames(my_sample_col) = c("Naive", "CM Pre-non-Tfh","CM PreTfh",
                            "T-Trans-Mem","T-Eff-Mem","T-helper",
                            "Eff-Tregs","non-GC-Tf-regs","GC-Tf-regs" ,
                            "B border_Tfh T", "Tfh-LZ-GC",
                            "GC-Tfh-SAP","GC-Tfh-0X40","Tfh-Mem")

annoCol<-list(Group=c("Naive"="grey", "Central Memory"="black", 
                      "Non-Tfh"="red", "Tfh"="yellow"))

avgexpr_mat <- AverageExpression(
features = features,
seurat,
assays = "peaks_level_5",
return.seurat = F,
group.by = group,
slot = "data")

p1 <- pheatmap(avgexpr_mat$peaks_level_5[,c(rownames(my_sample_col))], 
         scale = "row",
        angle_col = 45,
        show_rownames=T,
        annotation_col = my_sample_col,
        annotation_colors = annoCol,
         border_color = "white",
         cluster_rows = T,
         cluster_cols = F,
         fontsize_col = 10,
         clustering_distance_rows = "euclidean",
         clustering_method = "ward.D2",
         cutree_rows = cutree_nrows) 
return(p1)}

3 Load data

3.1 Gene reference data

gene_reference <-  here::here("scATAC-seq/Cicero/genes.gtf")
gtf <- rtracklayer::import(gene_reference)
gtf_df <- as.data.frame(gtf)

gtf_df_gene <- gtf_df[gtf_df$type == "gene",]
DT::datatable(gtf_df_gene)
# We are going to add 2000 bp upstream of each gene to take in acccount the proximal promoter regions
gtf_df_gene$start <- gtf_df_gene$start - upstream

3.2 CD4-T cells data

seurat <- readRDS(path_to_obj)
seurat_peaks <- seurat@assays$peaks_level_5@ranges
seurat
## An object of class Seurat 
## 93116 features across 16383 samples within 1 assay 
## Active assay: peaks_level_5 (93116 features, 92407 variable features)
##  3 dimensional reductions calculated: umap, lsi, harmony
plot_dim(seurat, group = "annotation_paper") 

seurat_RNA <- readRDS(path_to_RNA)
seurat_RNA
## An object of class Seurat 
## 37378 features across 52307 samples within 1 assay 
## Active assay: RNA (37378 features, 0 variable features)
##  3 dimensional reductions calculated: pca, umap, harmony
plot_dim(seurat_RNA, group = "annotation_paper") 

3.2.1 Grouping the cells in Non-Tfh & Tfh groups

At low level of resolution, we want to detect the main epigenomic changes between Non-Tfh vs Tfh. For this reason, we decide to group the cells in 4 clusters: Non-Tfh, Tfh, Central Memory and Naive.

seurat@meta.data <- seurat@meta.data %>% mutate(Group =
  case_when(annotation_paper == "Naive" ~ "Naive",
    annotation_paper == "CM Pre-non-Tfh" ~ "Central Memory",
    annotation_paper == "CM PreTfh" ~ "Central Memory",
    annotation_paper == "T-Trans-Mem" ~ "Non-Tfh",
    annotation_paper == "T-Eff-Mem" ~ "Non-Tfh",
    annotation_paper == "T-helper" ~ "Non-Tfh",
    annotation_paper == "Tfh T:B border" ~ "Tfh",
    annotation_paper == "Tfh-LZ-GC" ~ "Tfh",
    annotation_paper == "GC-Tfh-SAP" ~ "Tfh",
    annotation_paper == "GC-Tfh-0X40" ~ "Tfh",
    annotation_paper == "Tfh-Mem" ~ "Tfh",
    annotation_paper == "Memory T cells" ~ "Non-Tfh",
    annotation_paper == "Eff-Tregs" ~ "Non-Tfh",
    annotation_paper == "non-GC-Tf-regs" ~ "Non-Tfh",
    annotation_paper == "GC-Tf-regs" ~ "Non-Tfh"))

plot_dim(seurat, group = "Group") 

seurat_RNA@meta.data <- seurat_RNA@meta.data %>% mutate(Group =
  case_when(annotation_paper == "Naive" ~ "Naive",
    annotation_paper == "CM Pre-non-Tfh" ~ "Central Memory",
    annotation_paper == "CM PreTfh" ~ "Central Memory",
    annotation_paper == "T-Trans-Mem" ~ "Non-Tfh",
    annotation_paper == "T-Eff-Mem" ~ "Non-Tfh",
    annotation_paper == "T-helper" ~ "Non-Tfh",
    annotation_paper == "Tfh T:B border" ~ "Tfh",
    annotation_paper == "Tfh-LZ-GC" ~ "Tfh",
    annotation_paper == "GC-Tfh-SAP" ~ "Tfh",
    annotation_paper == "GC-Tfh-0X40" ~ "Tfh",
    annotation_paper == "Tfh-Mem" ~ "Tfh",
    annotation_paper == "Memory T cells" ~ "Non-Tfh",
    annotation_paper == "Eff-Tregs" ~ "Non-Tfh",
    annotation_paper == "non-GC-Tf-regs" ~ "Non-Tfh",
    annotation_paper == "GC-Tf-regs" ~ "Non-Tfh"))

plot_dim(seurat_RNA, group = "Group") 

4 Finding differentially expressed genes (DE)

The main goal of this analisys is to extract the differential expressed genes between the groups previously defined.

Idents(seurat_RNA) <- "Group"

#DE <- FindAllMarkers(
 # object = seurat_RNA,
  #logfc.threshold = 0.25,
  #test.use = "wilcox")

#output <- split(DE, DE$cluster)
path_to_save_DE <- here::here("scATAC-seq/results/files/CD4_T/DE_4_groups.xlsx")
#write_xlsx(output, path_to_save_DE)

#DT::datatable(DE)

4.1 Manual selection of genes candidates.

DE <- import_list(path_to_save_DE, setclass = "tbl", rbind = TRUE)

colnames(DE) <- c("p_val", "avg_log2FC", 
                  "pct.1" , "pct.2",  "p_val_adj",
                  "cluster", "gene_name", "_file")
Tfh_genes <- c("TOX2", "PDCD1","CXCL13","TOX","BCL6",
              "GNG4","IL21","SH2D1A","CD200","CXCR5","POU2AF1")

Naive_genes <- c("BACH2","LEF1","CCR7","NOSIP","KLF2","SELL")

Central_memory_genes <- c("ANK3","IL7R","TXNIP","ANXA1",
                          "ZBTB16","GPR183","TIGIT","IL21")

Non_Tfh_genes <- c("LAG3","RORA","IKZF2","KLRB1",
                   "IL2RA","PRDM1","IL1R1","CTLA4",
                   "FOXP3","CCR6","MAF","CCL20","IL1R2") 

target_genes <- c(Naive_genes,Central_memory_genes,
                  Tfh_genes,Non_Tfh_genes)

5 Study the chromatin dynamics in the DE genes

5.1 Extraction of the DE genes coordinates

expr_mat <- AverageExpression(
features = target_genes,
seurat_RNA,
return.seurat = F,
group.by = "Group",
slot = "data")

pheatmap(expr_mat$RNA[target_genes,],
   scale = "row",
   annotation_names_row = F,
   border_color = "white",
   cluster_rows = F,
   cluster_cols = T,
   fontsize_row = 10,
   clustering_distance_rows = "euclidean",
   clustering_distance_cols = "euclidean", 
   clustering_method = "ward.D2") 

#write.table(unique(expr_mat$RNA[target_genes,]), 
 #           quote = F, 
  #          here::here("scATAC-seq/results/plots/CD4-T/files_plots/matrix_RNA_genes.tsv"))

6 Study the chromatin dynamics in the DE genes

6.1 Extraction of the DE genes coordinates

all_genes <- gtf_df_gene[which(gtf_df_gene$gene_name %in% target_genes),]
all_genes_GR <- makeGRangesFromDataFrame(all_genes[c(1:5,13)],
                                         keep.extra.columns = T)
all_genes_GR
## GRanges object with 37 ranges and 1 metadata column:
##           seqnames              ranges strand |   gene_name
##              <Rle>           <IRanges>  <Rle> | <character>
##    141854     chr1 145990435-145996579      - |       TXNIP
##    188313     chr1 169688665-169711702      - |        SELL
##    243943     chr1 235545685-235650754      - |        GNG4
##    339680     chr2 101989960-102028544      + |       IL1R2
##    339786     chr2 102062544-102179874      + |       IL1R1
##       ...      ...                 ...    ... .         ...
##   2409954    chr19   16322817-16327874      + |        KLF2
##   2485933    chr19   49553468-49590262      - |       NOSIP
##   2557042    chr20   43912852-44069616      + |        TOX2
##   2702697     chrX   49248436-49270477      - |       FOXP3
##   2737879     chrX 124225868-124373197      + |      SH2D1A
##   -------
##   seqinfo: 40 sequences from an unspecified genome; no seqlengths
## Overlapping of the DE genes coordinates with the total number of peaks detected.
gr1 <- seurat_peaks
gr2 <- all_genes_GR
m <- findOverlaps(gr1, gr2)
gr1.matched <- gr1[queryHits(m)]
# Add the metadata from gr2
mcols(gr1.matched) <- cbind.data.frame(
    mcols(gr1.matched),
    mcols(gr2[subjectHits(m)]));

gr1.matched
## GRanges object with 515 ranges and 1 metadata column:
##         seqnames              ranges strand |   gene_name
##            <Rle>           <IRanges>  <Rle> | <character>
##     [1]     chr1 145996409-145997493      * |       TXNIP
##     [2]     chr1 169692335-169693366      * |        SELL
##     [3]     chr1 169694288-169694677      * |        SELL
##     [4]     chr1 169699061-169699292      * |        SELL
##     [5]     chr1 169700944-169701648      * |        SELL
##     ...      ...                 ...    ... .         ...
##   [511]     chrX 124328812-124329858      * |      SH2D1A
##   [512]     chrX 124339089-124339448      * |      SH2D1A
##   [513]     chrX 124342323-124342811      * |      SH2D1A
##   [514]     chrX 124346063-124346707      * |      SH2D1A
##   [515]     chrX 124358836-124359410      * |      SH2D1A
##   -------
##   seqinfo: 24 sequences from an unspecified genome; no seqlengths
gr1.matched$peaks <- paste0(seqnames(gr1.matched),"-",
                             start(gr1.matched),"-",
                             end(gr1.matched))

gr1.matched_df <- as.data.frame(gr1.matched)

my_sample_col <- data.frame(Gene = c(gr1.matched$gene_name))
rownames(my_sample_col) = unique(gr1.matched$peaks)

avgexpr_mat <- AverageExpression(
features = unique(gr1.matched$peaks),
seurat,
assays = "peaks_level_5",
return.seurat = F,
group.by = "Group",
slot = "data")

avgexpr_df <- as.data.frame(avgexpr_mat$peaks_level_5)
avgexpr_df$peaks <- row.names(avgexpr_df)

DA_DE_merge <- merge(avgexpr_df,
                       gr1.matched_df[c("peaks","gene_name")],
                       by=c("peaks"))  

DA_DE_merge_melt <- melt(DA_DE_merge)

# Computing the mean accessibility/expression per gene 
mean_accessibility <- tapply(DA_DE_merge_melt$value,
                               list(DA_DE_merge_melt$gene_name, DA_DE_merge_melt$variable),
                                    mean)


out <- pheatmap(mean_accessibility[target_genes,],
         scale = "row",
         border_color = "white",
         cluster_rows = F,
         cluster_cols = T,
         fontsize_row = 10,
         clustering_distance_cols = "euclidean", 
         clustering_method = "ward.D2")

#write.table(unique(mean_accessibility[target_genes,]), 
 #           quote = F, 
  #          here::here("scATAC-seq/results/plots/CD4-T/files_plots/matrix_ATAC_genes.tsv"))
    
DA_DE_merge <- merge(melt(mean_accessibility),
                     melt(expr_mat$RNA),
                           by=c("Var1","Var2"))   

colnames(DA_DE_merge) <- c("Gene" ,"Clusters","Accesibility","Expresion")
DA_DE_merge.melt <- melt(DA_DE_merge)

# Filtering conditions
df_Naive  <- filter(
  DA_DE_merge.melt,Clusters == "Naive" & Gene %in% Naive_genes)
df_CM  <- filter(
  DA_DE_merge.melt,Clusters == "Central Memory" & Gene %in% Central_memory_genes)
df_Tfh  <- filter(
  DA_DE_merge.melt,Clusters == "Tfh" & Gene %in% Tfh_genes)
df_Non_Tfn  <- filter(
  DA_DE_merge.melt,Clusters == "Non-Tfh" & Gene %in% Non_Tfh_genes)

selection_df <- rbind(df_Naive,df_CM,df_Tfh,df_Non_Tfn)
selection_df$value <- scale(selection_df$value)
  
 ggdotchart(selection_df, 
            x="Gene", 
            y="value", 
            add = "segments") +
        coord_flip() + facet_grid(vars(Clusters), vars(variable), scales = "free_y")

7 Master regulators: BCL6 and PRDM1

7.1 BCL6

bcl6 <- gtf_df_gene[which(gtf_df_gene$gene_name %in% "BCL6"),]
bcl6_gr <- makeGRangesFromDataFrame(bcl6)

bcl6_plot <- CoveragePlot(
  object = seurat,
  region = bcl6_gr)

bcl6_plot

overlapping_bcl6 <- seurat_peaks[queryHits(findOverlaps(seurat_peaks, 
                                                        bcl6_gr)),]

features <- paste0(seqnames(overlapping_bcl6),"-",
                   start(overlapping_bcl6),"-",
                   end(overlapping_bcl6))

#pdf(file = here::here("scATAC-seq/results/plots/CD4-T/bcl6_heatmaps.pdf"), 
 #   width = 10, 
  #  height = 4)

print(mat_heatmap(seurat = seurat, 
            features = features, 
            group = "annotation_paper",
            cutree_ncols = 3,cutree_nrows = 1))

#dev.off()

7.2 PRDM1

prdm1 <- gtf_df_gene[which(gtf_df_gene$gene_name %in% "PRDM1"),]
prdm1_gr <- makeGRangesFromDataFrame(prdm1)

prdm1_plot <-CoveragePlot(
  object = seurat,
  region = prdm1_gr)

prdm1_plot

overlapping_prdm1 <- seurat_peaks[queryHits(findOverlaps(seurat_peaks, prdm1_gr)),]

features <- paste0(seqnames(overlapping_prdm1),"-",
                   start(overlapping_prdm1),"-",
                   end(overlapping_prdm1))

#pdf(file = here::here("scATAC-seq/results/plots/CD4-T/prdm1_heatmaps.pdf"), 
 #   width = 10, 
  #  height = 4)

print(mat_heatmap(seurat = seurat, 
            features = features, 
            group = "annotation_paper",
            cutree_ncols = 3,cutree_nrows = 1))

#dev.off()
sessionInfo()
## R version 4.0.3 (2020-10-10)
## Platform: x86_64-apple-darwin13.4.0 (64-bit)
## Running under: macOS Big Sur 10.16
## 
## Matrix products: default
## BLAS/LAPACK: /Users/pauli/opt/anaconda3/envs/Motif_TF/lib/libopenblasp-r0.3.10.dylib
## 
## locale:
## [1] en_US.UTF-8/en_US.UTF-8/en_US.UTF-8/C/en_US.UTF-8/en_US.UTF-8
## 
## attached base packages:
## [1] parallel  stats4    stats     graphics  grDevices utils     datasets  methods   base     
## 
## other attached packages:
##  [1] rio_0.5.16           plyr_1.8.6           writexl_1.4.0        ggpubr_0.4.0         data.table_1.14.0    forcats_0.5.0        stringr_1.4.0        purrr_0.3.4          readr_1.4.0          tidyr_1.1.3          tibble_3.1.2         tidyverse_1.3.0      ggplot2_3.3.5        dplyr_1.0.7          TFBSTools_1.26.0     JASPAR2020_0.99.10   pheatmap_1.0.12      GenomicRanges_1.40.0 GenomeInfoDb_1.24.2  IRanges_2.22.1       S4Vectors_0.26.0     BiocGenerics_0.34.0  Signac_1.2.1         SeuratObject_4.0.2   Seurat_4.0.3         BiocStyle_2.16.1    
## 
## loaded via a namespace (and not attached):
##   [1] utf8_1.2.1                  reticulate_1.20             R.utils_2.10.1              tidyselect_1.1.1            poweRlaw_0.70.6             RSQLite_2.2.1               AnnotationDbi_1.50.3        htmlwidgets_1.5.3           grid_4.0.3                  docopt_0.7.1                BiocParallel_1.22.0         Rtsne_0.15                  munsell_0.5.0               codetools_0.2-17            ica_1.0-2                   DT_0.16                     future_1.21.0               miniUI_0.1.1.1              withr_2.4.2                 colorspace_2.0-2            Biobase_2.48.0              knitr_1.30                  rstudioapi_0.11             ROCR_1.0-11                 ggsignif_0.6.0              tensor_1.5                  listenv_0.8.0               labeling_0.4.2              slam_0.1-47                 GenomeInfoDbData_1.2.3      polyclip_1.10-0             bit64_4.0.5                 farver_2.1.0                rprojroot_2.0.2             parallelly_1.26.1           vctrs_0.3.8                 generics_0.1.0              xfun_0.18                   lsa_0.73.2                  ggseqlogo_0.1               R6_2.5.0                    bitops_1.0-7               
##  [43] spatstat.utils_2.2-0        DelayedArray_0.14.0         assertthat_0.2.1            promises_1.2.0.1            scales_1.1.1                gtable_0.3.0                globals_0.14.0              goftest_1.2-2               seqLogo_1.54.3              rlang_0.4.11                RcppRoll_0.3.0              splines_4.0.3               rstatix_0.6.0               rtracklayer_1.48.0          lazyeval_0.2.2              broom_0.7.2                 spatstat.geom_2.2-0         modelr_0.1.8                BiocManager_1.30.10         yaml_2.2.1                  reshape2_1.4.4              abind_1.4-5                 crosstalk_1.1.1             backports_1.1.10            httpuv_1.6.1                tools_4.0.3                 bookdown_0.21               ellipsis_0.3.2              spatstat.core_2.2-0         RColorBrewer_1.1-2          ggridges_0.5.3              Rcpp_1.0.6                  zlibbioc_1.34.0             RCurl_1.98-1.2              rpart_4.1-15                deldir_0.2-10               pbapply_1.4-3               cowplot_1.1.1               zoo_1.8-9                   haven_2.3.1                 SummarizedExperiment_1.18.1 ggrepel_0.9.1              
##  [85] cluster_2.1.0               here_1.0.1                  fs_1.5.0                    magrittr_2.0.1              scattermore_0.7             openxlsx_4.2.4              reprex_0.3.0                lmtest_0.9-38               RANN_2.6.1                  SnowballC_0.7.0             fitdistrplus_1.1-5          matrixStats_0.59.0          hms_0.5.3                   patchwork_1.1.1             mime_0.11                   evaluate_0.14               xtable_1.8-4                XML_3.99-0.3                readxl_1.3.1                sparsesvd_0.2               gridExtra_2.3               compiler_4.0.3              KernSmooth_2.23-17          crayon_1.4.1                R.oo_1.24.0                 htmltools_0.5.1.1           mgcv_1.8-33                 later_1.2.0                 lubridate_1.7.9             DBI_1.1.0                   tweenr_1.0.1                dbplyr_1.4.4                MASS_7.3-53                 car_3.0-10                  Matrix_1.3-4                cli_3.0.0                   R.methodsS3_1.8.1           igraph_1.2.6                pkgconfig_2.0.3             GenomicAlignments_1.24.0    TFMPvalue_0.0.8             foreign_0.8-80             
## [127] plotly_4.9.4.1              spatstat.sparse_2.0-0       xml2_1.3.2                  annotate_1.66.0             DirichletMultinomial_1.30.0 XVector_0.28.0              rvest_0.3.6                 digest_0.6.27               sctransform_0.3.2           RcppAnnoy_0.0.18            pracma_2.2.9                CNEr_1.24.0                 spatstat.data_2.1-0         Biostrings_2.56.0           cellranger_1.1.0            rmarkdown_2.5               leiden_0.3.8                fastmatch_1.1-0             uwot_0.1.10                 curl_4.3.2                  shiny_1.6.0                 Rsamtools_2.4.0             gtools_3.9.2                lifecycle_1.0.0             nlme_3.1-150                jsonlite_1.7.2              carData_3.0-4               viridisLite_0.4.0           BSgenome_1.56.0             fansi_0.5.0                 pillar_1.6.1                lattice_0.20-41             KEGGREST_1.28.0             fastmap_1.1.0               httr_1.4.2                  survival_3.2-7              GO.db_3.11.4                glue_1.4.2                  zip_2.1.1                   qlcMatrix_0.9.7             png_0.1-7                   bit_4.0.4                  
## [169] ggforce_0.3.2               stringi_1.6.2               blob_1.2.1                  caTools_1.18.2              memoise_1.1.0               irlba_2.3.3                 future.apply_1.7.0